Notes to this new file: (March 3, 2024)

Data cleaning and Preparation

timestamp() 
## ##------ Thu Mar 14 08:07:34 2024 ------##
# 0. record datasets ----
## 0.1 initial value setup ----
freq = 12 # the frequency of the data <- 12 for monthly; 4 for quarterly; 1 for annually
start.ym = as.yearmon(1966) # -1/12 # the starting time
end.ym = as.yearmon(2020) # the ending time 

month_select <- function(freq = freq) { # return the months for differnt time frequency
  if (freq == 12) {
    return(c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November","December"))
  }
  if (freq == 4) {
    return(c("March", "June", "September", "December"))
  }
  if (freq == 1) {
    return("December")
  }
}
freq_name <- function(freq = freq) {
  if (freq == 12) {
    return("monthly")
  }
  if (freq == 4) {
    return("quarterly") 
  }
  if (freq == 1) {
    return("annual")
  }
}

## 0.2 preliminary data cleaning ---- 
# "Variables_DK_CRSP_monthly_196601_201912.csv"
# "PredictorDataDK_CRSPm.csv"
# "PredictorDataDK_monthly_all.csv" - DK replication 
port_market <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/Variables_DK_CRSP_monthly_196601_201912.csv") %>% 
  as_tibble() %>% 
  mutate(E12 = ((E12*(E12 > 0) + 0.001*(E12 <= 0))) * 12/freq) %>% 
  filter(E12 > 0) %>% 
  mutate(month = as.yearmon(as.character(yyyymm), format = "%Y%m"), 
         vwret = rollsumr(log(1+vwret), 12/freq, NA) ) %>% # converted into the log returns. 
  filter(months(month) %in% month_select(freq = freq)) %>% 
  filter(month < end.ym)

plot(y = port_market$E12, x = port_market$month, typew = "b"); abline(h = 0, col = 'red', lty = 2)

    #elif negative == 'truncate':
    #    df['E12'] = np.where(df['E12'] < 0, 0.001, df['E12'])

write.csv(x = port_market, file = "market_Allfirms.csv")
# port_market_raw <- port_market %>% select(month, vwret, R, date, GM, GE, DP)

ports_all <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/PredictorDataDK_szbm_CRSPm.csv") %>% 
  # PredictorDataDK_port_CRSPm.csv
  # PredictorDataDK_szbm_CRSPm.csv
  as_tibble() %>%
  mutate(E12 = ((E12*(E12 > 0) + 0.001*(E12 <= 0))) * 12/freq) %>%  
  filter(E12 > 0) %>% 
  mutate(month = as.yearmon(as.Date(date, format = "%Y-%m-%d")),
         port = as.character(port), 
         vwret = rollsumr(log(1+vwret), 12/freq, NA) ) %>% 
  filter(months(month) %in% month_select(freq = freq)) %>% 
  filter(month < end.ym) 

ports <- unique(ports_all$port) # identifiers for portfolios
for (p in ports) {
  ports.dt <- ports_all %>%
    filter(port == p) %>%
    arrange(month) 
  ports.dt <- ports.dt[, -1] # remove the column `port` 
  write.csv(x = ports.dt, file = paste("port_", p, ".csv", sep = ""))
}

## 0.3 name portfolios and predictors ----
market.names <- list.files(pattern = "market_")[1]
data.names <- list.files(pattern = "^port_") # data for portfolios
  ## generate the name of portfolios 
  ## Define the two sets
  #set1 <- c("B", "S")
  #set2 <- c("H", "M", "L")
  # Use expand.grid() to generate all combinations
  #combinations <- expand.grid(set2, set1)[, c(2, 1)]
  ## Convert the result to a vector
  # ports <- do.call(paste0, combinations) ## ----updated March 7, 2024 
id.names <- c("Market", ports) # set plot names
ratio_names <- c("DP", "PE", "EY", "DY", "Payout") # potential predictors

## 0.4 risk-free rate -----
RF <- read.csv("/Users/hongyixu/Library/CloudStorage/OneDrive-HandelshögskolaniStockholm/Xu_Katselas_Drienko._2021/R&R_JEF_PredictorDataDk_Dec3_2023/Rfree_t30.csv") %>% # record the risk free rate
  as.tbl() %>% # as the average of the bid and ask.
  select(-X) %>%
  mutate(month = as.yearmon(month)) %>%
  filter(months(month) %in% month_select(freq = freq)) %>%
  filter(month < end.ym)

Figure 1 - Log Cumulative Index

Log cumulative realised portfolio return components for seven portfolios - the market portfolio and six size and book-to-market equity ratio sorted portfolios. All following figures demonstrate the monthly realised price-earnings ratio growth (gm), earnings growth (ge), dividend-price (dp) and the portfolio return index (r) with the values in January 1966 as zero for all portfolios.

# TABLE-1. summary statistics ----
TABLE1.uni <- list() # the univariate statistics
TABLE1.cor <- list() # the correlation matrixs

PE.df <- data.frame(month = port_market$month[port_market$month >= start.ym])
EY.df <- data.frame(month = port_market$month[port_market$month >= start.ym])
DP.df <- data.frame(month = port_market$month[port_market$month >= start.ym])

## (1*) summary tables for Summary & Correlations ----
for (c in 1:7) {
  id <- c(market.names, data.names)[c]
  # print(id); print(id.names[c])
  
  ## 1. read the data ---- 
  data_nyse <- read.csv(id) %>%
    as_tibble() %>%
    mutate(month = as.yearmon(month)) %>%
    filter(month >= start.ym) %>% # start from "Dec 1965" 
    select(month, r = vwret, P = Index, E = E12, D = D12, pe_raw = PE) %>% 
    # , R, GM_raw = GM, GE_raw = GE, DP_raw = DP
    mutate(DP = D / P, # these are adjusted by the log transformation
           PE = P / E,
           EP = E / P,
           EY = E / lag(P), # earnings yield
           DY = D / lag(P), # dividend yield
           Payout = D / E) # payout ratios
  
  PE.df <- cbind.data.frame(PE.df, data_nyse$PE)
  EY.df <- cbind.data.frame(EY.df, data_nyse$EY)
  DP.df <- cbind.data.frame(DP.df, data_nyse$DP)
  
  ## 2. return decomposition ----
  data_decompose <- data_nyse %>% 
    mutate(r = r, # cts returns = log total returns 
           gm = log(PE) - lag(log(PE)), # multiple expansion rate
           ge = log(E) - lag(log(E)), # earnings growth rate
           dp = log(1 + DP/freq)) %>% # only 1/12 of the dividends -----updated March 7, 2024 
    na.omit()
  
  ## 3. summary-Stat ----
  ar1.coef <- function(x) {
    return(as.numeric(lm(x ~ lag(x))$coefficients[2]))
  } # return the function value of the coefficient for the AR(1) model
  
  comp_summary.dt <- data_decompose %>%
    select(gm, ge, dp, r) %>%
    # , R, GM_raw, GE_raw, DP_raw 
    describe() %>%
    mutate(mean = mean * 100,
           sd = sd * 100,
           median = median * 100,
           min = min * 100,
           max = max * 100) %>%
    select(Mean = mean, Median = median, SD = sd, Min = min, Max = max, Skew = skew, Kurt = kurtosis) %>%
    round(digits = 4)
  
  comp_summary.dt$"AR(1)" <- data_decompose %>%
    select(gm, ge, dp, r) %>%
    apply(2, ar1.coef) %>%
    round(digits = 4)
  
  ### Store the summary stat
  # print(paste("Data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = ""))
  TABLE1.uni[[id.names[c]]] <- comp_summary.dt
  
  ## 4. correlations ----
  comp_cor <- data_decompose %>% select(gm, ge, dp, r) %>% cor()
  TABLE1.cor[[id.names[c]]] <- comp_cor
  
  # Figure-1. cumulative realised return components ---- 
  # jpeg(filename = paste("Figure1_", id.names[c], ".jpeg", sep = ""), width = 550, height = 350)
  par(mar = c(2, 4, 2, 1))
  cum_components.ts <- data_decompose %>%
    select(r, gm, ge, dp) %>%
    apply(2, cumsum) %>%
    ts(start = data_decompose$month[1], frequency = freq)
  plot.ts(cum_components.ts, plot.type = "single", lty = 1:4, main = id.names[c], cex.main = 1, 
          xlab = NULL, ylab = "Cumulative Return and Components Indices")
  legend("topleft",
         legend = c("Total return", "Price earnings growth",
                    "Earnings growth", "Dividend price"),
         lty = 1:4,
         cex = 1.0) # text size
  # dev.off()
  par(mar = c(5, 4, 4, 2) + 0.1)
}

write.csv(TABLE1.uni, file = "table_1.uni.csv")
write.csv(TABLE1.cor, file = "table_1.cor.csv")

.

Table 1 - Summary statistics of returns components

The correlations between gm and ge might be a bit too high comparing to Ferreira and Santa-Clara (2011). Need to check the code again.

‘kable’ for Table Creation

## summary data for table1
rowname <- rep(colnames(TABLE1.cor$Market), length(names(TABLE1.uni)))
portname = rep(names(TABLE1.uni), each = length(colnames(TABLE1.cor$Market)))

table1 <- do.call(rbind.data.frame, TABLE1.uni) %>% 
  cbind.data.frame(do.call(rbind.data.frame, TABLE1.cor)) %>%
  round(digits = 2) %>%
  cbind.data.frame(rowname, portname)

## give the table 1 outputs
gt(data = table1, rowname_col = "rowname", groupname_col = "portname") %>%
  tab_header(title = "Table 1 - Summary statistics of returns components",
             subtitle = paste(freq_name(freq = freq), " data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = "")) %>%
  tab_spanner(label = "Panel A: univariate statistics",
              columns = vars(Mean, Median, SD, Min, Max, Skew, Kurt, "AR(1)")) %>%
  tab_spanner(label = "Panel B: Correlations",
              columns = vars(gm, ge, dp, r)) %>%
  tab_source_note(source_note = html("Note: Panel A in this table presents mean, median, standard deviation (SD), minimum, maximum, skewness (Skew), kurtosis (kurt) and first-order autocorrelation coefficient of the realised components of stock market returns and six size and book-to-market equity ratio sorted portfolios. These univariate statistics for each portfolios are presented separately. gm is the continuously compounded growth rate in the price-earnings ratio. ge is the continuously compounded growth rate in earnings. dp is the log of one plus the dividend-price ratio. *r* is the portfolio returns. Panel B in this table reports correlation matrices for all seven portfolios. The sample period starts from Feburary 1966 and ends in December 2019."))
Table 1 - Summary statistics of returns components
monthly data starts from Feb 1966 and ends in Dec 2019.
Panel A: univariate statistics Panel B: Correlations
Mean Median SD Min Max Skew Kurt AR(1) gm ge dp r
Market
gm -0.04 0.20 51.14 -988.86 807.66 -5.06 309.28 0.12 1.00 -1.00 0.02 0.10
ge 0.76 0.90 50.89 -802.36 991.68 5.31 315.21 0.12 -1.00 1.00 -0.02 -0.01
dp 0.23 0.22 0.08 0.08 0.46 0.35 -0.44 0.99 0.02 -0.02 1.00 -0.04
r 0.81 1.21 4.45 -25.57 15.35 -0.76 2.66 0.07 0.10 -0.01 -0.04 1.00
BH
gm 0.10 0.24 188.73 -1175.52 1168.88 0.00 26.08 -0.36 1.00 -1.00 0.01 0.05
ge 0.59 0.15 188.59 -1168.39 1168.39 0.00 26.09 -0.35 -1.00 1.00 -0.01 -0.03
dp 0.39 0.35 0.18 0.12 0.87 0.55 -0.58 0.98 0.01 -0.01 1.00 0.01
r 0.89 1.32 4.63 -24.01 19.29 -0.62 2.64 0.06 0.05 -0.03 0.01 1.00
BL
gm -0.01 0.15 11.05 -65.37 62.88 0.16 8.93 -0.35 1.00 -0.91 -0.05 0.41
ge 0.72 0.91 10.06 -60.98 63.95 -0.07 13.74 -0.45 -0.91 1.00 0.01 0.01
dp 0.19 0.17 0.06 0.06 0.39 0.66 -0.06 0.96 -0.05 0.01 1.00 -0.06
r 0.77 1.10 4.66 -26.53 19.64 -0.59 2.40 0.06 0.41 0.01 -0.06 1.00
BM
gm 0.03 0.19 67.00 -1117.36 1107.45 -0.21 232.54 -0.05 1.00 -1.00 0.02 -0.03
ge 0.64 0.90 67.27 -1117.29 1125.20 0.15 236.19 -0.04 -1.00 1.00 -0.02 0.09
dp 0.32 0.29 0.13 0.13 0.80 0.90 0.44 0.99 0.02 -0.02 1.00 -0.06
r 0.81 1.15 4.31 -22.04 16.35 -0.63 2.71 0.04 -0.03 0.09 -0.06 1.00
SH
gm 2.56 1.33 225.37 -1495.12 1441.72 0.06 25.42 -0.22 1.00 -1.00 0.01 0.06
ge -1.36 0.00 225.14 -1443.75 1486.75 -0.08 25.48 -0.22 -1.00 1.00 -0.02 -0.04
dp 0.48 0.43 0.24 0.17 1.97 3.14 13.52 0.99 0.01 -0.02 1.00 -0.11
r 1.16 1.57 5.58 -33.05 25.89 -0.84 4.02 0.16 0.06 -0.04 -0.11 1.00
SL
gm 2.25 -0.54 209.91 -1163.99 1282.51 0.17 20.88 -0.31 1.00 -1.00 0.03 -0.05
ge -1.22 0.45 210.28 -1275.66 1170.72 -0.17 20.89 -0.31 -1.00 1.00 -0.03 0.08
dp 0.18 0.15 0.13 0.03 0.92 2.62 8.99 0.97 0.03 -0.03 1.00 -0.04
r 0.65 1.24 6.86 -39.25 25.46 -0.76 2.91 0.14 -0.05 0.08 -0.04 1.00
SM
gm 0.02 -0.22 201.78 -1464.86 1463.12 -0.07 28.32 -0.35 1.00 -1.00 0.02 -0.05
ge 1.10 0.86 202.11 -1467.68 1470.85 0.07 28.43 -0.34 -1.00 1.00 -0.02 0.08
dp 0.30 0.28 0.12 0.09 0.69 0.61 0.04 0.96 0.02 -0.02 1.00 -0.08
r 1.06 1.50 5.49 -32.57 23.59 -0.84 3.54 0.14 -0.05 0.08 -0.08 1.00
Note: Panel A in this table presents mean, median, standard deviation (SD), minimum, maximum, skewness (Skew), kurtosis (kurt) and first-order autocorrelation coefficient of the realised components of stock market returns and six size and book-to-market equity ratio sorted portfolios. These univariate statistics for each portfolios are presented separately. gm is the continuously compounded growth rate in the price-earnings ratio. ge is the continuously compounded growth rate in earnings. dp is the log of one plus the dividend-price ratio. *r* is the portfolio returns. Panel B in this table reports correlation matrices for all seven portfolios. The sample period starts from Feburary 1966 and ends in December 2019.

Figure 3 - Cumulative OOS R-sqaure Difference and Cumulative SSE Difference

# TABLE-2. different tables OOS R-square ----
TABLE2 <- list()
table2.df <- data.frame()

## (3*) repeat for each portfolio ----
c <- 0
for (id in c(market.names, data.names)) {
  c <- c + 1
  print(id); print(id.names[c])
  
  ## 1. read the data ----
  data_nyse <- read.csv(id) %>%
    as.tbl() %>%
    mutate(month = as.yearmon(month)) %>%
    filter(month >= start.ym) %>% # start from "Dec 1965"
    select(month, r = vwret, P = Index, E = E12, D = D12) %>%
    mutate(DP = D / P, # construct predictors
           PE = P / E,
           EP = E / P,
           EY = E / lag(P), # earnings yield
           DY = D / lag(P), # dividend yield
           Payout = D / E) # payout ratios
  
  ## 2. return decomposition ----
  data_decompose <- data_nyse %>% # also try PD ratio replacing PE.
    mutate(r = r, # cts returns (has already being the log return in row 95)
           gm = log(PE) - lag(log(PE)), # multiple expansion rate
           ge = log(E) - lag(log(E)), # earnings growth rate
           dp = log(1 + DP/freq)) %>% # see note 1.
    # only 1/12 of the annualised dividend is included. > see note 1.
    na.omit() 
  
  ## 3. SOP predictions ---- 
  k = freq * 20 # set a 20-year rolling window
  data_pred <- data_decompose %>%
    select(month, r, gm, ge, dp) %>%
    mutate(mu_gm = 0,
           mu_ge1 = rollmeanr(ge, k, fill = NA_real_), # rolling mean 
           mu_ge2 = c(rep(NA_real_, (k-1)), cummean(ge)[k:length(ge)]), # recursive mean
           mu_ge3 = cummean(ge), # recursive mean from the beginning 
           a_DK1 = rollmeanr(r - dp, k, fill = NA_real_), # methods Eq (14/15) by DK 
           a_DK2 = cummean(r - dp), # methods Eq (14/15) by DK 
           mu_dp = dp,
           mu_sop = mu_gm + mu_ge1 + mu_dp) %>% # the predictor > see note 2.
    mutate(sop_simple = lag(mu_sop), # conditional predictions
           hist_mean = (cummean(r)) ) # historical mean predictions
  
  ## 4. OOS R2 and MSE-F ----
  ### build the function for the bootstrap
  Boot_MSE.F <- function(data, actual, cond, uncond, x, critical.values = TRUE, boot.times = 10000) {
    ## clean and reorganise the data
    ports.dt <- data %>%
      select(month, actual = actual, cond = cond, uncond = uncond, x = x)
    
    ports.dt_narm <- ports.dt %>%
      na.omit() %>%
      mutate(error_A = actual - cond,
             error_N = actual - uncond,
             Cuml_MSE_A = cummean(error_A ^ 2),
             Cuml_MSE_N = cummean(error_N ^ 2),
             Cuml_OOS.rsquared = 1 - Cuml_MSE_A / Cuml_MSE_N,
             Cum_SSE = cumsum(error_N ^2 - error_A ^ 2) )
    ### for the full (out-of-)sample
    MSE_A <- mean(ports.dt_narm$error_A ^ 2)
    MSE_N <- mean(ports.dt_narm$error_N ^ 2)
    OOS_r.squared <- 1 - MSE_A / MSE_N # out-of-sample R square
    MSE_F <- length(ports.dt_narm$month) * (MSE_N - MSE_A) / (MSE_A) 
    MAE_A <- mean(abs(ports.dt_narm$error_A)) %>% round(digits = 6) # Mean absolute error of the conditional model
    print(paste("OOS R Squared: ", round(OOS_r.squared, digits = 4), sep = ""))
    print(paste("MSE-F: ", round(MSE_F, digits = 4), sep = ""))
    
    Cuml_OOS.ts <- ts(ports.dt_narm$Cuml_OOS.rsquared, start = ports.dt_narm$month[1], frequency = freq)
    Cum_SSE.ts <- ts(ports.dt_narm$Cum_SSE, start = ports.dt_narm$month[1], frequency = freq)
    
    ## Bootstrapped MSE-F Stat
    if (critical.values == TRUE) {
      ## get the data series for x and y
      y0 <- ports.dt[is.na(ports.dt$cond), ]$actual
      y1 <- ports.dt[!is.na(ports.dt$cond), ]$actual
      x1 <- as.vector(na.omit(ports.dt$x))
      
      ## full sample estimation for the model
      alpha <- mean(y1)
      u1 <- y1 - alpha
      
      x.lm <- lm(x1 ~ lag(x1))
      mu <- as.numeric(coef(x.lm)[1])
      rho <- as.numeric(coef(x.lm)[2])
      u2 <- as.numeric(x.lm$residuals)
      
      u <- data.frame(index = 1:length(u1), u1, u2) # the dataset storing all the residual info
      
      ### bootstrapping pairs of error terms
      boot.critical <- c()
      for (i in 1:boot.times) { # the bootstrapped times can be modified
        index.boot <- sample(u$index, length(u1), replace = T)
        u.boot <- data.frame()
        for (j in index.boot) {
          u.boot <- rbind.data.frame(u.boot, u[j,])
        } # record the bootstrapped error terms
        
        ### reconstruct the simulated x and y
        y1.new <- alpha + u.boot$u1
        x0.new <- sample(x1, 1) # resample a value as the starting value of x
        x1.new <- c(mu + rho * x0.new + u.boot$u2[1])
        for (j in 2:length(y1.new)) {
          x1.new[j] <- mu + x1.new[j-1] * rho + u.boot$u2[j]
        } # simulate SOP values
        
        ### redo the rolling estimation
        y.boot <- c(y0, y1.new)
        x.boot <- c(rep(NA_real_, (length(y0) - 1)), x0.new, x1.new)
        
        data.dt <- data.frame(month = ports.dt$month, x.boot, y.boot) %>%
          as.tbl() %>%
          mutate(conditional = lag(x.boot), # convert to the SOP prediction
                 unconditional = lag(cummean(y.boot))) %>% # convert to the historical mean prediction
          na.omit()
        
        error_N.boot = data.dt$y.boot - data.dt$unconditional
        error_A.boot = data.dt$y.boot - data.dt$conditional
        MSE_N.boot = mean(error_N.boot ^ 2) 
        MSE_A.boot = mean(error_A.boot ^ 2) 
        OOS_r.squared.boot <- 1 - MSE_A.boot / MSE_N.boot %>% round(digits = 6) # out-of-sample R square
        
        ### MSE-F statistic
        MSE_F.boot <- length(error_N.boot) * (MSE_N.boot - MSE_A.boot) / (MSE_A.boot) %>% round(digits = 6)
        
        boot.critical[i] <- MSE_F.boot
        
        if (i %% (boot.times/10) == 0) {
          timestamp()
          print(paste("SOP", ": ", i %/% (boot.times/100), "%", sep = ""))
        }
      }
      ## store the results
      result <- cbind.data.frame(IS_r.squared = NA_real_,
                                 OOS_r.squared,
                                 MAE_A, # MAE of conditional model
                                 MSE_F,
                                 t(quantile(boot.critical, probs = c(0.90, 0.95, 0.99))),
                                 p.value = mean(boot.critical > MSE_F))
      # d_RMSE = round(d_RMSE, digits = 4),
      # DM_stat = round(DM_test.result$statistic, digits = 4),
      # DM_pval = round(DM_test.result$p.value, digits = 4))
    } else {
      result <- cbind.data.frame(IS_r.squared = NA_real_,
                                 OOS_r.squared,
                                 MAE_A, # MAE of conditional model
                                 MSE_F)
    }
    
    rownames(result) <- "SOP"
    output <- list(result = result, Cuml_OOS.ts = Cuml_OOS.ts, Cum_SSE.ts = Cum_SSE.ts)
    return(output)
  }
  
  ### store the results for the SOP method
  sop.result <- Boot_MSE.F(data = data_pred, actual = "r", cond = "sop_simple", uncond = "hist_mean", x = "mu_sop", critical.values = FALSE, boot.times = 3000)
  sop.result$result
  
  par(mfrow = c(2,1))
  par(mar = c(3,5,3,2))
  plot.ts(sop.result$Cuml_OOS.ts * 100, xlab = NULL, ylab = "as %", main = paste(id.names[c], ": Cumulative OOS R Squared Difference - SOP", sep = ""))
  abline(h = c(0,1), lty = 2, col = 2)
  par(mar = c(5,5,3,2))
  plot.ts(sop.result$Cum_SSE.ts, ylab = NULL, cex.lab = 0.5,
          xlab = "An increase in a line indicates better performance of the conditional model",
          main = paste(id.names[c], ": Cumulative SSE Difference - SOP", sep = ""))
  abline(h = 0, lty = 2, col = 2)
  par(mfrow = c(1,1))
  
  ### store all cumulative OOS R2 difference values
  Cuml_all.ts <- sop.result$Cuml_OOS.ts * 100
  Csse_all.ts <- sop.result$Cum_SSE.ts
  
  ## 5. univariate predictive regressions ----
  table2.uni_predictors <- data.frame()
  
  for (predictor in ratio_names) {
    ## construct conditional & unconditional predictions
    data_univariate <- data_decompose %>%
      select(month, r, predictor) %>%
      mutate(hist_mean = lag(cummean(r)),
             x = lag(get(predictor)) %>% log) ## convert to log predictors
      # data_univariate[[predictor]] <- log(data_univariate[[predictor]])
      # data_univariate[["x"]] <- log(data_univariate[["x"]])
    
    ## IS R2
    lm.IS <- lm(r ~ x, data = data_univariate)
    IS_r.squared <- summary(lm.IS)$r.squared # in-sample r squared
    IS_pval <- summary(lm.IS)$coefficients[2,4] # the p-value from F-statistic
    
    ## OOS recursive window predictions
    k <- k # the starting in-sample data
    con_pred = rep(NA_real_, nrow(data_univariate))
    for (t in (k+2):nrow(data_univariate)) {
      x.IS <- data_univariate$x[2:(t-1)]
      y.IS <- data_univariate$r[2:(t-1)]
      reg.IS <- lm(y.IS ~ x.IS)
      
      x.new <- data_univariate$x[t]
      y.pred <- predict(reg.IS, newdata = data.frame(x.IS = x.new))
      con_pred[t] <- y.pred # store the prediction
    }
    data_univariate$con_pred <- con_pred
    data_univariate
    
    ## Stat and Bootstrap
    data = data_univariate
    actual = "r"
    cond = "con_pred"
    uncond = "hist_mean"
    critical.values = FALSE # decide whether the bootstrapped critical value is calculated > Note 3.
    boot.times = 1000 * 10
    
    { ## get OOS R2 & MSE-F
      ports.dt <- data %>%
        select(month, actual = actual, cond = cond, uncond = uncond, predictor) %>%
        rename(x = predictor)
      
      ports.dt_narm <- ports.dt %>%
        na.omit() %>%
        mutate(error_A = actual - cond,
               error_N = actual - uncond,
               Cuml_MSE_A = cummean(error_A ^ 2),
               Cuml_MSE_N = cummean(error_N ^ 2),
               Cuml_OOS.rsquared = 1 - Cuml_MSE_A / Cuml_MSE_N,
               Cum_SSE = cumsum(error_N ^2 - error_A ^ 2))
      MSE_A <- mean(ports.dt_narm$error_A ^ 2)
      MSE_N <- mean(ports.dt_narm$error_N ^ 2)
      OOS_r.squared <- 1 - MSE_A / MSE_N # out-of-sample R square
      MSE_F <- length(ports.dt_narm$month) * (MSE_N - MSE_A) / (MSE_A) 
      MAE_A <- mean(abs(ports.dt_narm$error_A)) # Mean absolute error of the conditional model
      print(paste("IS R Squared: ", round(IS_r.squared, digits = 4), sep = ""))
      print(paste("OOS R Squared: ", round(OOS_r.squared, digits = 4), sep = ""))
      print(paste("MSE-F: ", round(MSE_F, digits = 4), sep = ""))
      
      ### combine the cumulative OOS R2 difference of other predictors & cum SSE
      Cuml_pred.ts <- ts(ports.dt_narm$Cuml_OOS.rsquared * 100, start = ports.dt_narm$month[1], frequency = freq)
      Cuml_all.ts <- ts.union(Cuml_all.ts, Cuml_pred.ts)
      Cum_pred.sse <- ts(ports.dt_narm$Cum_SSE, start = ports.dt_narm$month[1], frequency = freq)
      Csse_all.ts <- ts.union(Csse_all.ts, Cum_pred.sse)
      
      ## prepare the bootstrap
      if (critical.values == TRUE) {
        y0 <- ports.dt$actual[1]
        y1 <- ports.dt$actual[-1]
        x1 <- ports.dt$x
        
        ## full sample estimation for the model
        alpha <- mean(y1)
        u1 <- y1 - alpha
        
        x.lm <- lm(x1 ~ lag(x1))
        mu <- as.numeric(coef(x.lm)[1])
        rho <- as.numeric(coef(x.lm)[2])
        u2 <- as.numeric(x.lm$residuals)
        
        u <- data.frame(index = 1:length(u1), u1, u2) # the dataset storing all the residual info
        
        ### bootstrap pairs of error terms
        boot.critical <- c()
        for (i in 1:boot.times) { # the bootstrapped times can be modified
          index.boot <- sample(u$index, length(u1), replace = T)
          u.boot <- data.frame()
          for (j in index.boot) {
            u.boot <- rbind.data.frame(u.boot, u[j,])
          } # record the bootstrapped error terms
          
          ### reconstruct the simulated x and y
          y1.new <- alpha + u.boot$u1
          x0.new <- sample(x1, 1) # resample a value as the starting value of x
          x1.new <- c(mu + rho * x0.new + u.boot$u2[1])
          for (j in 2:length(y1.new)) {
            x1.new[j] <- mu + x1.new[j-1] * rho + u.boot$u2[j]
          } # simulate predictors
          
          ### redo the rolling estimation
          y.boot <- c(y0, y1.new)
          x.boot <- c(x0.new, x1.new)
          
          data.dt <- as.tbl(data.frame(month = ports.dt$month, x.boot, y.boot, x = lag(x.boot)))
          con_pred = rep(NA_real_, nrow(data.dt))
          for (t in (k+2):nrow(data.dt)) {
            x.IS <- data.dt$x[2:(t-1)]
            y.IS <- data.dt$y.boot[2:(t-1)]
            reg.IS <- lm(y.IS ~ x.IS)
            
            x.new <- data.dt$x[t]
            y.pred <- predict(reg.IS, newdata = data.frame(x.IS = x.new))
            con_pred[t] <- y.pred
          }
          data.dt$conditional <- con_pred
          data.dt <- data.dt %>%
            mutate(unconditional = lag(cummean(y.boot))) %>%
            na.omit()
          
          error_N.boot = data.dt$y.boot - data.dt$unconditional
          error_A.boot = data.dt$y.boot - data.dt$conditional
          MSE_N.boot = mean(error_N.boot ^ 2)
          MSE_A.boot = mean(error_A.boot ^ 2)
          OOS_r.squared.boot <- 1 - MSE_A.boot / MSE_N.boot # out-of-sample R square
          
          ### MSE-F statistic
          MSE_F.boot <- length(error_N.boot) * (MSE_N.boot - MSE_A.boot) / (MSE_A.boot)
          
          boot.critical[i] <- MSE_F.boot
          
          if (i %% (boot.times/10) == 0) {
            timestamp()
            print(paste(predictor, ": ", i %/% (boot.times/100), "%", sep = ""))
          }
        }
        ## store the results
        result <- cbind.data.frame(IS_r.squared, 
                                   OOS_r.squared,
                                   MAE_A, # MAE of conditional model
                                   MSE_F,
                                   t(quantile(boot.critical, probs = c(0.90, 0.95, 0.99))),
                                   p.value = mean(boot.critical > MSE_F))
        # d_RMSE = round(d_RMSE, digits = 4),
        # DM_stat = round(DM_test.result$statistic, digits = 4),
        # DM_pval = round(DM_test.result$p.value, digits = 4))
      } else {
        result <- cbind.data.frame(IS_r.squared, 
                                   OOS_r.squared,
                                   MAE_A, # MAE of conditional model
                                   MSE_F)
      }
      rownames(result) <- predictor
      result
    }
    
    ## store the results for all predictors
    table2.uni_predictors <- rbind.data.frame(table2.uni_predictors, result)
  }
  table2.uni_predictors
  
  ## 6. statistics summary ----
  table2 <- rbind.data.frame(table2.uni_predictors, "SOP" = sop.result$result)
  if ("p.value" %in% colnames(table2)) {
    table2$star <- ifelse(table2$p.value <= 0.01, "***", ifelse(table2$p.value <= 0.05, "**", ifelse(table2$p.value <= 0.1, "*", "")))
  }
  # statistical significance from McCracken (2007)
  table2$McCracken <- ifelse(table2$MSE_F >= 3.838, "***", ifelse(table2$MSE_F >= 1.599, "**", ifelse(table2$MSE_F >= 0.685, "*", "")))
  
  table2
  TABLE2[[id.names[c]]] <- table2
  
  table2$rowname <- rownames(table2)
  table2$portname <- id.names[c]
  table2.df <- rbind.data.frame(table2.df, table2)
  
  ## 7. cumulative OOS R2 difference merged dataset ----
  colnames(Cuml_all.ts) <- c("SOP", ratio_names)
  par(mfrow = c(3,3))
  for (method in colnames(Cuml_all.ts)) {
    par(mar = c(2, 2.5, 2, 0.5))
    plot.ts(window(Cuml_all.ts[, method], start = 1990), xlab = NULL, lty = 2, ylab = NULL, main = method, cex.main = 0.8, xaxt = "n", las = 2)
    axis(1, seq(1990, 2020, by = 10), cex.axis = 0.8)
  }
  
  colnames(Csse_all.ts) <- c("SOP", ratio_names)
  par(mfrow = c(3,3))
  for (method in colnames(Csse_all.ts)) {
    par(mar = c(2, 2.5, 2, 0.5))
    plot.ts(Csse_all.ts[, method], xlab = NULL, lty = 2, ylab = NULL, main = method, cex.main = 0.8, xaxt = "n", las = 2)
    axis(1, seq(1990, 2020, by = 10), cex.axis = 0.8)
  }
  par(mfrow = c(1,1), mar = c(5, 4, 4, 2) + 0.1)
}
## [1] "market_Allfirms.csv"
## [1] "Market"
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(actual)` instead of `actual` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cond)` instead of `cond` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(uncond)` instead of `uncond` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(x)` instead of `x` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## [1] "OOS R Squared: -0.0379"
## [1] "MSE-F: -14.852"
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(predictor)` instead of `predictor` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

## [1] "IS R Squared: 0.0083"
## [1] "OOS R Squared: -0.0135"
## [1] "MSE-F: -5.3965"
## [1] "IS R Squared: 5e-04"
## [1] "OOS R Squared: -0.0179"
## [1] "MSE-F: -7.1247"
## [1] "IS R Squared: 4e-04"
## [1] "OOS R Squared: -0.0203"
## [1] "MSE-F: -8.067"
## [1] "IS R Squared: 0.0097"
## [1] "OOS R Squared: -0.0138"
## [1] "MSE-F: -5.5158"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0055"
## [1] "MSE-F: -2.2225"

## [1] "port_BH.csv"
## [1] "BH"
## [1] "OOS R Squared: -0.1251"
## [1] "MSE-F: -45.25"

## [1] "IS R Squared: 0.0114"
## [1] "OOS R Squared: -0.0021"
## [1] "MSE-F: -0.8315"
## [1] "IS R Squared: 0.0016"
## [1] "OOS R Squared: -0.0089"
## [1] "MSE-F: -3.5668"
## [1] "IS R Squared: 0.0017"
## [1] "OOS R Squared: -0.0088"
## [1] "MSE-F: -3.547"
## [1] "IS R Squared: 0.0129"
## [1] "OOS R Squared: 3e-04"
## [1] "MSE-F: 0.1056"
## [1] "IS R Squared: 6e-04"
## [1] "OOS R Squared: -0.0117"
## [1] "MSE-F: -4.6891"

## [1] "port_BL.csv"
## [1] "BL"
## [1] "OOS R Squared: -1e-04"
## [1] "MSE-F: -0.0586"

## [1] "IS R Squared: 0.0066"
## [1] "OOS R Squared: -0.0016"
## [1] "MSE-F: -0.6326"
## [1] "IS R Squared: 0.0074"
## [1] "OOS R Squared: 0.0015"
## [1] "MSE-F: 0.5969"
## [1] "IS R Squared: 0.0086"
## [1] "OOS R Squared: 0.0024"
## [1] "MSE-F: 0.97"
## [1] "IS R Squared: 0.0077"
## [1] "OOS R Squared: -0.0011"
## [1] "MSE-F: -0.4457"
## [1] "IS R Squared: 1e-04"
## [1] "OOS R Squared: -0.0045"
## [1] "MSE-F: -1.8376"

## [1] "port_BM.csv"
## [1] "BM"
## [1] "OOS R Squared: -0.0328"
## [1] "MSE-F: -12.9152"

## [1] "IS R Squared: 0.0025"
## [1] "OOS R Squared: -0.0177"
## [1] "MSE-F: -7.0796"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0513"
## [1] "MSE-F: -19.8056"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0513"
## [1] "MSE-F: -19.8285"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0167"
## [1] "MSE-F: -6.6568"
## [1] "IS R Squared: 1e-04"
## [1] "OOS R Squared: -0.0484"
## [1] "MSE-F: -18.7299"

## [1] "port_SH.csv"
## [1] "SH"
## [1] "OOS R Squared: -0.5211"
## [1] "MSE-F: -139.4333"

## [1] "IS R Squared: 0.0015"
## [1] "OOS R Squared: -0.0182"
## [1] "MSE-F: -7.2402"
## [1] "IS R Squared: 3e-04"
## [1] "OOS R Squared: -0.0055"
## [1] "MSE-F: -2.2295"
## [1] "IS R Squared: 3e-04"
## [1] "OOS R Squared: -0.0054"
## [1] "MSE-F: -2.1623"
## [1] "IS R Squared: 0.0037"
## [1] "OOS R Squared: -0.023"
## [1] "MSE-F: -9.1349"
## [1] "IS R Squared: 2e-04"
## [1] "OOS R Squared: -0.0064"
## [1] "MSE-F: -2.5807"

## [1] "port_SL.csv"
## [1] "SL"
## [1] "OOS R Squared: -0.1544"
## [1] "MSE-F: -54.4467"

## [1] "IS R Squared: 0.0083"
## [1] "OOS R Squared: -0.0121"
## [1] "MSE-F: -4.837"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0127"
## [1] "MSE-F: -5.0936"
## [1] "IS R Squared: 0"
## [1] "OOS R Squared: -0.0153"
## [1] "MSE-F: -6.1138"
## [1] "IS R Squared: 0.0111"
## [1] "OOS R Squared: -0.0155"
## [1] "MSE-F: -6.2142"
## [1] "IS R Squared: 2e-04"
## [1] "OOS R Squared: -0.0228"
## [1] "MSE-F: -9.0543"

## [1] "port_SM.csv"
## [1] "SM"
## [1] "OOS R Squared: -0.1648"
## [1] "MSE-F: -57.5885"

## [1] "IS R Squared: 0.0037"
## [1] "OOS R Squared: -0.0126"
## [1] "MSE-F: -5.0385"
## [1] "IS R Squared: 0.003"
## [1] "OOS R Squared: -0.0033"
## [1] "MSE-F: -1.3546"
## [1] "IS R Squared: 0.0028"
## [1] "OOS R Squared: -0.0047"
## [1] "MSE-F: -1.8966"
## [1] "IS R Squared: 0.006"
## [1] "OOS R Squared: -0.0191"
## [1] "MSE-F: -7.5974"
## [1] "IS R Squared: 0.004"
## [1] "OOS R Squared: -9e-04"
## [1] "MSE-F: -0.3848"

Table 2 - Forecasts of portfolio returns

This table demonstrates the in-sample and out-of-sample R-squares for the market and six size and book-to-market equity ratio sorted portfolios from predictive regressions and the Sum-of-the-Parts method. IS R-squares are estimated using the whole sample period and the OOS R-squares are calculated compare the forecast error of the model against the historical mean model. The full sample period starts from Feb 1966 to December 2019 and the IS period is set to be 20 years with forecsats beginning in Feb 1986. The MSE-F statistics are calculated to test the hypothesis \(H_0: \text{out-of-sample R-squares} = 0\) vs \(H_1: \text{out-of-sample R-squares} \neq 0\).

Predictors here are all in log terms.

gt(table2.df, rowname_col = "rowname", groupname_col = "portname") %>%
  tab_header(title = "Table 2 - Forecasts of portfolio returns",
             subtitle = paste(freq_name(freq = freq), " data starts from ", first(data_decompose$month), " and ends in ", last(data_decompose$month), ".", sep = "")) %>%
  fmt_number(columns = 1:4, decimals = 6, suffixing = TRUE)
Table 2 - Forecasts of portfolio returns
monthly data starts from Feb 1966 and ends in Dec 2019.
IS_r.squared OOS_r.squared MAE_A MSE_F McCracken
Market
DP 0.008341 −0.013471 0.033688 −5.396537
PE 0.000497 −0.017862 0.033200 −7.124701
EY 0.000404 −0.020272 0.033270 −8.067006
DY 0.009736 −0.013773 0.033781 −5.515775
Payout 0.002776 −0.005504 0.032549 −2.222544
SOP NA −0.037873 0.033120 −14.851966
BH
DP 0.011372 −0.002052 0.035642 −0.831458
PE 0.001588 −0.008863 0.035109 −3.566768
EY 0.001669 −0.008814 0.035111 −3.547031
DY 0.012858 0.000260 0.035650 0.105551
Payout 0.000623 −0.011684 0.034990 −4.689096
SOP NA −0.125086 0.037888 −45.250032
BL
DP 0.006620 −0.001561 0.034211 −0.632646
PE 0.007436 0.001468 0.034144 0.596908
EY 0.008587 0.002383 0.034126 0.969995 *
DY 0.007719 −0.001099 0.034210 −0.445663
Payout 0.000060 −0.004547 0.034026 −1.837591
SOP NA −0.000144 0.033867 −0.058633
BM
DP 0.002479 −0.017747 0.032528 −7.079636
PE 0.000017 −0.051284 0.031991 −19.805606
EY 0.000026 −0.051346 0.031996 −19.828484
DY 0.002845 −0.016669 0.032531 −6.656812
Payout 0.000133 −0.048364 0.031498 −18.729904
SOP NA −0.032773 0.031677 −12.915206
SH
DP 0.001519 −0.018157 0.039478 −7.240162
PE 0.000271 −0.005522 0.039231 −2.229540
EY 0.000312 −0.005354 0.039237 −2.162327
DY 0.003696 −0.023018 0.039673 −9.134863
Payout 0.000199 −0.006397 0.039227 −2.580653
SOP NA −0.521116 0.053183 −139.433314
SL
DP 0.008256 −0.012057 0.049408 −4.837020
PE 0.000017 −0.012705 0.049646 −5.093602
EY 0.000006 −0.015289 0.049829 −6.113785
DY 0.011124 −0.015544 0.049476 −6.214239
Payout 0.000213 −0.022810 0.049548 −9.054271
SOP NA −0.154435 0.053362 −54.446666
SM
DP 0.003725 −0.012566 0.038938 −5.038514
PE 0.003001 −0.003348 0.038679 −1.354584
EY 0.002768 −0.004693 0.038714 −1.896646
DY 0.006001 −0.019070 0.039103 −7.597391
Payout 0.004041 −0.000949 0.038618 −0.384807
SOP NA −0.164816 0.042175 −57.588485